In [1]:
import altair as alt
import pandas as pd
from vega_datasets import data
# import vega
import geopandas as gpd
In [2]:
path = 'datasets/crimedata.csv'
crime_data = pd.read_csv(path)
crime_data.head()
Out[2]:
Unnamed: 0 Neighbourhood Total - Age groups and average age of the population - 100% data 0 to 14 years...3 0 to 4 years...4 5 to 9 years...5 10 to 14 years...6 15 to 64 years...7 15 to 19 years...8 20 to 24 years...9 ... MONTH DAY HOUR MINUTE HUNDRED_BLOCK X Y Population density Average cost of house in neighbour Average income
0 1 Arbutus-Ridge 15295.0 2015.0 455.0 685.0 880.0 9805.0 1230.0 1165.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2 Downtown 62030.0 4000.0 2080.0 1105.0 810.0 51275.0 1180.0 4050.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 11.0 15.0 14.0 30.0 29XX W 31ST AVE 487516.1816 5454623.638 NaN NaN NaN
3 4 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 7.0 11.0 21.0 0.0 29XX W 31ST AVE 487579.6067 5454613.684 NaN NaN NaN
4 5 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 4.0 25.0 21.0 54.0 29XX W 33RD AVE 487585.2638 5454405.082 NaN NaN NaN

5 rows × 269 columns

View 1 - Trend of Crime linked to Crime by Month (Uni-Directional Linking)¶

Task(s) Addressed:¶

  1. What is the trend of total crime over time by year? (Trend)
  2. What is the Distribution of crime each Year by Month? (Characterize Distribution)
In [3]:
#Creating the desired dataframe
# Discard year 2023 as it does not have complete data (current year)
df = crime_data[crime_data['YEAR'] < 2023]
#Combine different TYPEs into a couple similar types
df_ym = df.groupby(['YEAR', 'MONTH']).count()
df_ym = df_ym.rename(columns={"Unnamed: 0": "Count",})
df_ym = pd.DataFrame(df_ym.to_records())
# Add a new column with the month names
df_ym["MONTH"] = df_ym['MONTH'].apply(lambda x: pd.Timestamp(year=2000, month=int(x), day=1).strftime('%b'))
df_ym.head()
Out[3]:
YEAR MONTH Count Neighbourhood Total - Age groups and average age of the population - 100% data 0 to 14 years...3 0 to 4 years...4 5 to 9 years...5 10 to 14 years...6 15 to 64 years...7 ... TYPE DAY HOUR MINUTE HUNDRED_BLOCK X Y Population density Average cost of house in neighbour Average income
0 2003.0 Jan 5043 5042 3740 3740 3740 3740 3740 3740 ... 5043 5043 5043 5043 5043 5042 5042 0 0 0
1 2003.0 Feb 4250 4250 3283 3283 3283 3283 3283 3283 ... 4250 4250 4250 4250 4250 4250 4250 0 0 0
2 2003.0 Mar 4665 4658 3583 3583 3583 3583 3583 3583 ... 4665 4665 4665 4665 4665 4658 4658 0 0 0
3 2003.0 Apr 4895 4894 3714 3714 3714 3714 3714 3714 ... 4895 4895 4895 4895 4895 4894 4894 0 0 0
4 2003.0 May 5439 5431 4057 4057 4057 4057 4057 4057 ... 5439 5439 5439 5439 5438 5431 5431 0 0 0

5 rows × 269 columns

In [4]:
# Selector
selector = alt.selection_single(fields=['YEAR'])

# Create the Base Graph
base = alt.Chart(df_ym).properties(
    width=350,
    height=300,
).add_selection(selector)

# Create the Line Chart
lines = base.mark_line().encode(
    x=alt.X('YEAR:O', title="Year"),
    y=alt.Y('sum(Count):Q', title="Number of Crimes"),
    opacity=alt.condition(selector, alt.value(1), alt.value(0.1)),
    tooltip=[
        alt.Tooltip('YEAR'),
        alt.Tooltip('sum(Count)')
    ]
)
lines = lines.mark_line(point=True,color='orange').encode(
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25))
).add_selection(selector)

# Create the Bar Chart
bar = base.mark_bar(opacity=1, thickness=100).encode(
    x=alt.X('MONTH:O', axis=alt.Axis(labelAngle=-30), title="Month", sort="-y"),
    y=alt.Y('sum(Count)', title="Crime by Month"),
    tooltip=[
        alt.Tooltip('MONTH:O', title="Month"),
        alt.Tooltip('sum(Count)', title='Incedents of Crime'),
        #alt.Tooltip('YEAR', title='Year')
    ]

).transform_filter(  # Add this method to filter the data based on the selected year
    selector
)

complete = (lines | bar).properties(
    title="Trend of Crime in Vancouver Linked to Crime by Month"
).configure_point(
    size=75
)
complete
Out[4]:

View 2 - Scatter Plot of Mean Age and Total Population by Neighbourhood linked to a Stacked Bar Chart of Distribution of Type of Crime by Neighborhoods Near Downtown (Bi-directional Linking)¶

Task(s) Addressed:¶

  1. What is the frequency and distribution of the types of crime that are observed in neighbourhoods in downtown Vancouver?
  2. How do the average ages of the population and total populations of each of the neighbourhoods in downtown Vancouver compare, and are they related to the types of crime that are observed in these neighbourhoods?
In [5]:
# Used https://altair-viz.github.io/gallery/scatter_with_layered_histogram.html as a resource
# Discard year 2023 as it does not have complete data (current year)
df = crime_data[crime_data['YEAR'] < 2023]

#Combine different TYPEs into a couple similar types
df = df.replace('Break and Enter Commercial','Break And Enter')
df = df.replace('Break and Enter Residential/Other','Break And Enter')
df = df.replace('Homicide','Offence Against a Person')
df = df.replace('Other Theft','Other Theft')
df = df.replace('Theft from Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Bicycle','Vehicle Related Theft')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Fatality)','Traffic Accident')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Injury)','Traffic Accident')

# Look at subsection of neighbourhoods in Downtown
neighbourhoods = ['Strathcona', 'Grandview-Woodland', 'Hastings-Sunrise', 'Mount Pleasant', 'Fairview']

df2 = df.loc[df['Neighbourhood'].isin(neighbourhoods)] 
In [6]:
# Second Vis:

# Select Neighbourhood, Mean Age, Total Population, and Type of Crime
df = df2.groupby(['Neighbourhood', 'TYPE', 'Average age of the population',
                   "Total - Age groups and average age of the population - 100% data"]).count()
df = df.rename(columns={"Unnamed: 0": "Count",})
df = pd.DataFrame(df.to_records())

df.head()

# Selector
selector = alt.selection_single(fields = ['Neighbourhood'])

#Create the Base Graph 
base = alt.Chart(df).properties(
    width=300,
    height=250,
).add_selection(selector)

# Create the ScatterPlot
points = base.mark_circle(size = 200).encode(
    x=alt.X('Average age of the population', scale = alt.Scale(domain = [30, 55]), title = "Average Age of the Population"),
    y=alt.Y('Total - Age groups and average age of the population - 100% data',
            scale = alt.Scale(domain = [10000, 40000]), title = "Total Population"),
    color='Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.1)),
    tooltip = [
        alt.Tooltip('Average age of the population'), 
        alt.Tooltip('Neighbourhood:N'), 
        alt.Tooltip('Total - Age groups and average age of the population - 100% data:N', 
                    title = 'Total Population in the Neighbourhood')
    ]   
)

#Create the Bar Chart
bar = base.mark_bar(opacity=1, thickness=100).encode(
    x=alt.X('TYPE:N', axis=alt.Axis(labelAngle=-30), title = "Type of Crime"),
    y=alt.Y('sum(Count)', title = "Incidents of Crime"),
    color= 'Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25)),
    tooltip = [
        alt.Tooltip('TYPE', title = "Type of Crime"), 
        alt.Tooltip('Neighbourhood:N'), 
        alt.Tooltip('sum(Count)', 
                    title = 'Incedents of Crime')
    ]   
)

# Combine the two visualizations 
complete = (points | bar).properties(
    title = "Age to Population Scatterplot Linked to Crime For Each Type Stacked Bar Chart by Neighbourhood (Downtown area)"
)
complete
Out[6]:

View 3 - Distribution of Crime in Vancouver¶

Task(s) Addressed:¶

  1. What are the neighbourhoods that have historically had the most amount of crime over the period of the dataset?
  2. How does the total crime of a neighbourhood in the dataset compare with the total crime of neighbourhoods geographically near it and far from it?
In [7]:
import altair as alt
from vega_datasets import data
import requests
import json

vancouver_url = 'https://opendata.vancouver.ca/explore/dataset/local-area-boundary/download/?format=geojson&timezone=America/Los_Angeles'
In [8]:
df = crime_data['Neighbourhood'].value_counts()
df = df.to_frame()
df = df.rename(columns={"Neighbourhood": "count"})
df = df.rename_axis('Neighbourhood').reset_index()
gdf = gpd.read_file(vancouver_url)

gdf = gdf.rename(columns={'name': 'Neighbourhood'})
gdf = gdf.merge(df, on='Neighbourhood')
neighbours = gdf['Neighbourhood'].unique() # get unique field values

selectNeighbourhood = alt.selection_single(
    name='Select', # name the selection 'Select'
    fields=['Neighbourhood'], # limit selection to the Major_Genre field
    init={'Neighbourhood': neighbours[0]}, # use first genre entry as initial value
    bind=alt.binding_select(options=neighbours) # bind to a menu of unique genre values
)
gdf_projected = gdf.to_crs("EPSG:32610")  # You can replace EPSG:32610 with an appropriate EPSG code for your area
gdf_projected["centroid"] = gdf_projected["geometry"].centroid
gdf_projected["centroid"] = gdf_projected["centroid"].to_crs(gdf.crs)
gdf["centroid_lng"] = gdf_projected["centroid"].apply(lambda point: point.x)
gdf["centroid_lat"] = gdf_projected["centroid"].apply(lambda point: point.y)


data  = alt.InlineData(values = gdf.__geo_interface__, #geopandas to geojson
                       # root object type is "FeatureCollection" but we need its features
                       format = alt.DataFormat(property='features',type='json')) 
data


base = alt.Chart(data).mark_geoshape(
    stroke='black',
    strokeWidth=1
).add_selection(
    selectNeighbourhood
).encode(
    color=alt.Color("properties.count:Q", title='Incedents of Crime'),
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incedents of Crime')
    ],
    opacity=alt.condition(selectNeighbourhood, alt.value(0.75), alt.value(0.25), legend=None)
).transform_calculate(
    Neighbourhood='datum.properties.Neighbourhood'
).project(
    type='identity', reflectY=True
)


text_chart = alt.Chart(data).mark_text(
    align='center',
    baseline='middle',
    fontSize=10,
    fontWeight="bold",
    dy=-8  # Adjust the y-offset of the text labels if necessary
).encode(
    longitude='properties.centroid_lng:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.mapid:N',  # Use the 'mapid' column for text
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incedents of Crime')
    ]
)
map_with_mapid = base + text_chart
map_with_mapid = map_with_mapid.properties(
    height=500,
    width=600,
    title="Distribution of Total Crime in Vancouver by Neighbourhood"
)
map_with_mapid
Out[8]:

View 4 - Trend of Crime Data by Type of Crime¶

Task(s) Addressed:¶

  1. How has the crime rate by crime type in Vancouver changed over time? (Trend)
In [9]:
# Used https://altair-viz.github.io/gallery/multiline_tooltip.html as a resource
# Discard year 2023 as it is not over
df = crime_data[crime_data['YEAR'] < 2023]

#Combine different TYPEs into a couple similar types
df = df.replace('Break and Enter Commercial','Break And Enter')
df = df.replace('Break and Enter Residential/Other','Break And Enter')
df = df.replace('Homicide','Offence Against a Person')
df = df.replace('Other Theft','Other Theft')
df = df.replace('Theft from Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Bicycle','Vehicle Related Theft')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Fatality)','Traffic Accident')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Injury)','Traffic Accident')
#Change Year Column to Temporal
df['YEAR'] = pd.to_datetime(df.YEAR, format='%Y')
In [10]:
df
Out[10]:
Unnamed: 0 Neighbourhood Total - Age groups and average age of the population - 100% data 0 to 14 years...3 0 to 4 years...4 5 to 9 years...5 10 to 14 years...6 15 to 64 years...7 15 to 19 years...8 20 to 24 years...9 ... MONTH DAY HOUR MINUTE HUNDRED_BLOCK X Y Population density Average cost of house in neighbour Average income
2 3 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 11.0 15.0 14.0 30.0 29XX W 31ST AVE 487516.1816 5454623.638 NaN NaN NaN
3 4 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 7.0 11.0 21.0 0.0 29XX W 31ST AVE 487579.6067 5454613.684 NaN NaN NaN
4 5 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 4.0 25.0 21.0 54.0 29XX W 33RD AVE 487585.2638 5454405.082 NaN NaN NaN
5 6 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 9.0 8.0 7.0 0.0 29XX W 33RD AVE 487585.2638 5454405.082 NaN NaN NaN
6 7 Dunbar-Southlands 21425.0 3545.0 675.0 1225.0 1650.0 14215.0 1800.0 1740.0 ... 12.0 2.0 7.0 54.0 29XX W 38TH AVE 487435.4586 5453876.477 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
853943 853944 Arbutus Ridge NaN NaN NaN NaN NaN NaN NaN NaN ... 11.0 9.0 10.0 23.0 YEW ST / KING EDWARD AVE 488652.0000 5455342.000 NaN NaN NaN
853944 853945 Arbutus Ridge NaN NaN NaN NaN NaN NaN NaN NaN ... 9.0 24.0 12.0 31.0 YEW ST / NANTON AVE 488739.0000 5454999.000 NaN NaN NaN
853945 853946 Arbutus Ridge NaN NaN NaN NaN NaN NaN NaN NaN ... 8.0 31.0 20.0 1.0 YEW ST / W 33RD AVE 488673.0000 5454386.000 NaN NaN NaN
853946 853947 Arbutus Ridge NaN NaN NaN NaN NaN NaN NaN NaN ... 1.0 23.0 18.0 16.0 YEW ST / W 37TH AVE 488470.0000 5453964.000 NaN NaN NaN
853947 853948 Arbutus Ridge NaN NaN NaN NaN NaN NaN NaN NaN ... 8.0 17.0 14.0 56.0 YEW ST / W 39TH AVE 488463.0000 5453756.000 NaN NaN NaN

848542 rows × 269 columns

In [11]:
# Find Neighbourhood Count
df = df.groupby(['YEAR', 'TYPE']).count()
df = df.rename(columns={"Unnamed: 0": "Count",})
df = pd.DataFrame(df.to_records())

df
Out[11]:
YEAR TYPE Count Neighbourhood Total - Age groups and average age of the population - 100% data 0 to 14 years...3 0 to 4 years...4 5 to 9 years...5 10 to 14 years...6 15 to 64 years...7 ... MONTH DAY HOUR MINUTE HUNDRED_BLOCK X Y Population density Average cost of house in neighbour Average income
0 2003-01-01 Break And Enter 10081 10081 8710 8710 8710 8710 8710 8710 ... 10081 10081 10081 10081 10081 10081 10081 0 0 0
1 2003-01-01 Mischief 6387 6387 4955 4955 4955 4955 4955 4955 ... 6387 6387 6387 6387 6387 6387 6387 0 0 0
2 2003-01-01 Offence Against a Person 3531 3529 2654 2654 2654 2654 2654 2654 ... 3531 3531 3531 3531 3531 3531 3531 0 0 0
3 2003-01-01 Other Theft 11426 11426 7969 7969 7969 7969 7969 7969 ... 11426 11426 11426 11426 11424 11426 11426 0 0 0
4 2003-01-01 Traffic Accident 1881 1849 1568 1568 1568 1568 1568 1568 ... 1881 1881 1881 1881 1881 1849 1849 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
115 2022-01-01 Mischief 5604 5604 3354 3354 3354 3354 3354 3354 ... 5604 5604 5604 5604 5604 5604 5604 0 0 0
116 2022-01-01 Offence Against a Person 3884 3882 2424 2424 2424 2424 2424 2424 ... 3884 3884 3884 3884 3884 3884 3884 0 0 0
117 2022-01-01 Other Theft 10731 10731 7272 7272 7272 7272 7272 7272 ... 10731 10731 10731 10731 10731 10731 10731 0 0 0
118 2022-01-01 Traffic Accident 1031 1028 839 839 839 839 839 839 ... 1031 1031 1031 1031 1031 1030 1030 0 0 0
119 2022-01-01 Vehicle Related Theft 9689 9689 6738 6738 6738 6738 6738 6738 ... 9689 9689 9689 9689 9689 9689 9689 0 0 0

120 rows × 269 columns

In [12]:
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection_single(nearest=True, on='mouseover',
                        fields=['YEAR'])

# # The basic line
line = alt.Chart(df).mark_line().encode(
    alt.X('YEAR:T', title = 'Year'),
    y='Count',
    color='TYPE:N'
)


# Transparent selectors across the chart. This is what tells us the x-value of the cursor
selectors = alt.Chart(df).mark_point().encode(
    x='YEAR:T',
    opacity=alt.value(0)

)

selectors_near = selectors.encode(
    opacity=alt.condition(nearest, alt.value(0.5), alt.value(0.3))
).add_selection(nearest)

# Draw points on the line, and highlight based on selection
points = line.mark_circle().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
).add_selection(nearest)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Count:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart(df).mark_rule(color='gray').encode(
    x='YEAR:T',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
layers = alt.layer(
    line, selectors, points, text, rules
).properties(
    width=600, height=300,
    title = "Total Crime in Vancouver Over Time by Crime Type"
)

layers
Out[12]:
In [ ]: